/*
 * Provenance: Power Programmierung / Power-Programmierung CD 2
 * (Tewi)(1994).iso / doc / mir / f_print.c
 * Text file, dated 1992-07-02, 13KB, 383 lines.
 */
/*
* usage: f_print file_name [/a][/w] [ from_byte to_byte ] > subset
*
* F_PRINT Reduces a file to printable characters only. If the /w
* option is specified, strings of printable characters that
* are unlikely to be words are filtered out as well, and
* each new burst of accepted text is placed on a new line.
* /a causes accented characters to be accepted as printable.
*
* input: Any file whatsoever, or any part of a file.
*
* output: Printable subset.
*
* writeup: MIR TUTORIAL ONE, topic 5
*
* Written: Douglas Lowry Jan 07 92
* Modified: Douglas Lowry Feb 27 92
* Copyright (C) 1992 Marpex Inc.
*
* The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
* usage and co-ordination of the MIR family of programs to analyze,
* prepare and index databases (small through gigabyte size), and
* how to build integrated retrieval software around the MIR search
* engine. The fifth of the five MIR tutorial series explains how
* to extend indexing capability into leading edge search-related
* technologies. For more information, GO IBMPRO on CompuServe;
* MIR files are in the DBMS library. The same files are on the
* Canada Remote Systems BBS. A diskette copy of the Introduction
* is available by mail ($10 US... check, Visa or Mastercard);
* diskettes with Introduction, Tutorial ONE software and the
* shareware Tutorial ONE text cost $29. Shareware registration
* for a tutorial is also $29.
*
* E-mail...
* Compuserve 71431,1337
* Internet doug.lowry%canrem.com
* UUCP canrem!doug.lowry
* Others: doug.lowry@canrem.uucp
*
* FAX... 416 963-5677
*
* "Snail mail"... Douglas Lowry, Ph.D.
* Marpex Inc.
* 5334 Yonge Street, #1102
* North York, Ontario
* Canada M2N 6M2
*
* Related database consultation and preparation services are
* available through:
* Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
* North York, Ontario Canada M2J 4Z7
* Tel. 416 492-3838 FAX 416 492-3843
*
* This program is free software; you may redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* (file 05LICENS) along with this program; if not, write to the
* Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
* USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#define BIGBUF 2048
#define STORE 32
#define NON_PRINT 0
#define WHITE_SPACE 1
#define PUNCTUATION 2
#define DIGIT 3
#define CONSONANT 4
#define VOWEL 5
#define HI_CONSONANT 6
#define HI_VOWEL 7
#define TYPE_CT 8 /* count of above types */
#define repeat for(;;)
/*
* declarations
*/
typedef enum _bool
{ FALSE = 0, TRUE = 1 } Bool;
void Usage_(), process(), clear_store() ;
Bool check_store() ;
/* Supplies the program name that Usage_ prints in its usage line. */
char *
Cmdname_()
{
    char *program = "f_print" ;

    return program ;
}
/*
* GLOBAL VARIABLES
*/
/*
 * Character class of every possible byte value, indexed by the byte.
 * Values: 0 non-printable, 1 white space, 2 punctuation, 3 digit,
 * 4 consonant, 5 vowel, 6 accented consonant, 7 accented vowel.
 * The legends for 0x80 and above show the glyphs the author assigned
 * to those byte values.
 *
 * Corrections relative to the earlier table:
 *   'U' is a vowel (it was 4, while 'u' was already 5);
 *   '.' and '@' are punctuation (they were 5 and 4 respectively,
 *   which let runs of dots and at-signs satisfy the "vowel and
 *   consonant" word test).
 * The table is only ever read, hence const.
 */
static const unsigned char table[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* ctls */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* ctls */
/* bl  !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/*  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
/*  @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  */
    2, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
/*  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _  */
    4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 2, 2, 2, 2, 2,
/*  `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  */
    2, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
/*  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~ DEL */
    4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 2, 2, 2, 2, 0,
/*  Ç  ü  é  â  ä  à  å  ç  ê  ë  è  ï  î  ì  Ä  Å  */
    6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7,
/*  É  æ  Æ  ô  ö  ò  û  ù  ÿ  Ö  Ü  ¢  £  ¥  ₧  ƒ  */
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0,
/*  á  í  ó  ú  ñ  Ñ  ª  º  ¿  ⌐  ¬  ½  ¼  ¡  «  »  */
    7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
 * MAIN
 *
 * usage: f_print file_name [/a][/w] [ from_byte to_byte ]
 *
 * Opens file_name in binary mode, interprets the optional arguments
 * and hands the stream to process().
 *
 *   /w or -w   filter out printable runs that are unlikely to be words
 *   /a or -a   accept accented characters as printable
 *   number     first one is from_byte, any later one is to_byte
 *
 * Option letters are matched without regard to case.  With no byte
 * range the file is read from offset 0 up to offset 0x0fffffff.
 *
 * A first argument that starts with '-', '/' or '?' is taken as a
 * request for help, so a file name beginning with '/' cannot be given.
 *
 * Returns 0 on success.  Every failure goes through Usage_() or
 * exit(), both of which end the program with status 1.
 */
int
main( int argc, char **argv )
{
    FILE *fp ;
    char c10, opt, *end ;
    Bool words,
         accent_ok,             /* allow accented chars */
         got_from ;             /* found a "from_byte" argument */
    int i ;
    long fr_byte, to_byte,      /* byte range */
         value ;

    /* argc is tested first so that argv[1] is never read when absent */
    if( argc < 2 || argc > 6 )
        Usage_() ;
    c10 = argv[1][0] ;
    if( c10 == '-' || c10 == '/' || c10 == '?' )
        Usage_() ;

    if(( fp = fopen( argv[1], "rb" )) == NULL )
    {
        fprintf( stderr, "\nUnable to open file %s.\n", argv[1] );
        Usage_();
    }

    words = got_from = accent_ok = FALSE ;
    fr_byte = 0 ;
    to_byte = 0x0fffffff ;

    for( i = 2 ; i < argc ; i++ )
    {
        c10 = argv[i][0] ;
        opt = '\0' ;
        /* The second character is looked at only when the first one is
         * a switch character, which guarantees that it exists (it may
         * be the terminating NUL).  The cast keeps toupper() defined
         * for bytes above 0x7f. */
        if( c10 == '-' || c10 == '/' )
            opt = (char) toupper( (unsigned char) argv[i][1] ) ;

        if( opt == 'W' )
            words = TRUE ;
        else if( opt == 'A' )
            accent_ok = TRUE ;
        else
        {
            /* Anything else must be a complete, non-negative decimal
             * number; unknown switches and stray text are rejected
             * instead of being read as offset 0. */
            value = strtol( argv[i], &end, 10 ) ;
            if( end == argv[i] || *end != '\0' || value < 0 )
            {
                fprintf( stderr, "\nInvalid byte offset %s.\n", argv[i] );
                Usage_() ;
            }
            if( got_from )
                to_byte = value ;
            else
            {
                fr_byte = value ;
                got_from = TRUE ;
            }
        }
    }

    if( fr_byte && fseek( fp, fr_byte, SEEK_SET ))
    {
        fprintf( stderr, "Unable to position %s to %ld\n",
            argv[1], fr_byte );
        Usage_() ;
    }

    process( fp, fr_byte, to_byte, accent_ok, words ) ;
    fclose( fp );

    /* Buffered output can still fail when it is finally written out */
    if( fflush( stdout ) == EOF )
    {
        fprintf( stderr, "Unable to write... FATAL.\n\n" );
        exit( 1 );
    }
    return 0 ;
}
/*
 * USAGE_ - Writes the command summary to stderr, then ends the
 * program with exit status 1.  Control never returns to the caller.
 */
void
Usage_()
{
    /* Synopsis line (with the program name filled in) and the first
     * part of the description. */
    fprintf( stderr,
        "\nUsage: %s file_name [/a][/w] [ from_byte to_byte ] > subset\n\n"
        "Reduces a file to printable characters only. If the /w\n"
        "option is specified, strings of printable characters that\n"
        "are unlikely to be words are filtered out as well, and\n",
        Cmdname_() ) ;

    /* Remainder of the description; it has no conversions to expand. */
    fputs(
        " each new burst of accepted text is placed on a new line.\n"
        "/a causes accented characters to be accepted as printable.\n\n"
        "input: Any file whatsoever, or any part of a file.\n\n"
        "output: Printable subset.\n\n"
        "writeup: MIR TUTORIAL ONE, topic 5\n\n",
        stderr ) ;

    exit( 1 ) ;
}
/*
* PROCESS - Passes through file from starting position,
* filtering out unprintable material.
*/
void
process( fp, fr_byte, to_byte, accent_ok, words )
FILE *fp ;
long int fr_byte, /* beginning offset */
to_byte ; /* ending offset */
Bool accent_ok,
words ;
{
/* The technique implemented below tests only the first STORE
* characters of a printable sequence. Once this limit is reached,
* we assume full printability until a NON_PRINT character is found.
* The function check_store controls the criteria for whether the
* start of a printable sequence passes.
*/
unsigned char buffer[ BIGBUF ],
store[ STORE ],
uc ;
long int offset, /* cumulative bytes into file */
up_to ; /* test one beyond "to_byte" */
int buflen, /* of buffer contents */
in_store, /* consecutive bytes in store */
stor_typ[TYPE_CT], /* count each type in store */
type, /* of character per table above */
prev_type, /* previous type */
i, j, pt ;
offset = fr_byte ;
up_to = to_byte + 1 ;
prev_type = type = NON_PRINT ;
clear_store( &in_store, stor_typ ) ;
repeat
{
if( offset++ > up_to )
break ;
buflen = fread( buffer, sizeof( char ), BIGBUF, fp );
if( !buflen )
break ;
for( pt = 0 ; pt < buflen ; pt ++ )
{
offset++ ;
if( offset > up_to )
break ;
uc = buffer[ pt ] ;
prev_type = type ;
type = table[ uc ] ;
if( !accent_ok && ( type == HI_CONSONANT || type == HI_VOWEL ))
type = NON_PRINT ;
/* Two accented characters in sequence are not printable */
if(( prev_type == HI_CONSONANT || prev_type == HI_VOWEL )
&& ( type == HI_CONSONANT || type == HI_VOWEL ))
{
type = NON_PRINT ;
in_store-- ;
}
if( type == NON_PRINT )
{
if( words && in_store == STORE )
putchar( '\n' );
else if( words && in_store &&
check_store( in_store, stor_typ ))
{
for( i = 0 ; i < in_store ; i++ )
{
if( putchar( store[i] ) != store[i] )
{
fprintf( stderr,
"Unable to write... FATAL.\n\n" );
exit( 1 );
}
}
putchar( '\n' );
}
if( in_store )
clear_store( &in_store, stor_typ ) ;
continue ;
}
/* printable characters - output or add to store */
if( !words || in_store == STORE )
{
if( putchar( uc ) != uc )
{
fprintf( stderr, "Unable to write... FATAL.\n\n" );
exit( 1 );
}
}
else
{
store[ in_store++ ] = uc ;
stor_typ[ type ]++ ;
if( in_store == STORE )
{
if( !check_store( in_store, stor_typ ))
clear_store( &in_store, stor_typ ) ;
else
{
for( i = 0 ; i < in_store ; i++ )
{
if( putchar( store[i] ) != store[i] )
{
fprintf( stderr,
"Unable to write... FATAL.\n\n" );
exit( 1 );
}
}
}
}
}
}
}
return;
}
/*
* CLEAR_STORE
*/
void
clear_store( in_store, stor_typ )
int *in_store, /* consecutive bytes in store */
stor_typ[ TYPE_CT ]; /* count each type in store */
{
int i ;
*in_store = 0 ;
for( i = 1 ; i < TYPE_CT ; i++ )
stor_typ[ i ] = 0 ;
return ;
}
/*
* CHECK_STORE - Is the series held in "store" valid "words"?
*
* In the version that follows, a series passes if it contains 5 or
* more bytes made up of:
* 1. digits with NO vowels AND NO consonants
* 2. vowels AND consonants (with or without digits)
*
* You may wish to try alternative forms of this function. Its objective
* is to maximize retention of desired text while minimizing retention of
* junk. Proximity might be considered... more than 4 consonants in a
* row, no vowels between white spaces, etc.
*/
Bool
check_store( in_store, typ )
int in_store,
typ[ TYPE_CT ]; /* count each type in store */
{
if( in_store < 5 )
return( FALSE );
typ[ CONSONANT ] += typ[ HI_CONSONANT ] ;
typ[ VOWEL ] += typ[ HI_VOWEL ];
if( typ[ DIGIT ] && !typ[ CONSONANT ] && !typ[ VOWEL ] )
return( TRUE ) ;
if( typ[ CONSONANT ] && typ[ VOWEL ] )
return( TRUE ) ;
return( FALSE ) ;
}